Create a set of functions that can be used together to segment satellite images into similar regions using k-means clustering, and then create and apply a color mask to areas of water. Specifically:
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from skimage.io import imread
from skimage.filters import gaussian
from sklearn.cluster import KMeans
In [3]:
# Load the satellite image from disk and display it unmodified.
image = imread('./Segmentation/sat4.jpg')
plt.imshow(image);
In [4]:
# part 1
# Blur the image with a Gaussian kernel (sigma=6) and display the result.
# NOTE(review): newer scikit-image releases replace `multichannel` with
# `channel_axis` -- confirm against the installed version.
filtered = gaussian(image, sigma=6, multichannel=True)
plt.imshow(filtered);
In [5]:
# part 2
def pre_process(img, sigma, multichannel=True):
    """Smooth an image with a Gaussian filter.

    Parameters
    ----------
    img : array-like
        Image handed unchanged to ``skimage.filters.gaussian``.
    sigma : scalar or sequence of scalars
        Standard deviation of the Gaussian kernel.
    multichannel : bool, default True
        Whether the last axis of ``img`` holds colour channels that must not
        be blurred into each other.

    Returns
    -------
    The filtered image, exactly as returned by ``gaussian``.
    """
    import inspect

    # Newer scikit-image releases express the channel layout through
    # ``channel_axis`` instead of the ``multichannel`` flag.  Pick whichever
    # keyword the installed ``gaussian`` accepts so the helper works on both.
    if 'channel_axis' in inspect.signature(gaussian).parameters:
        return gaussian(img, sigma=sigma, channel_axis=-1 if multichannel else None)
    return gaussian(img, sigma=sigma, multichannel=multichannel)
In [6]:
# part 3
def segmentation(img, nclusters, multichannel=True, random_state=None):
    """Cluster the pixels of an image with k-means.

    Parameters
    ----------
    img : array-like
        Image of shape (height, width, channels) when ``multichannel`` is
        true, otherwise an image with one value per pixel.
    nclusters : int
        Number of k-means clusters.
    multichannel : bool, default True
        Whether ``img`` carries a channel axis; each pixel then contributes
        one sample with ``channels`` features, otherwise one feature.
    random_state : int, RandomState or None, default None
        Forwarded to ``KMeans``.  Pass an integer to make the cluster
        numbering repeatable between runs; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    labels : ndarray
        ``kmeans.labels_``: one cluster id per pixel, in row-major order.
    inertia : float
        ``kmeans.inertia_`` of the fitted model.

    Raises
    ------
    ValueError
        If ``multichannel`` is true but ``img`` has no third axis, or if the
        pixel count does not match the requested layout.
    """
    img = np.asarray(img)
    # get image dimensions
    imheight, imwidth = img.shape[:2]
    if multichannel:
        if img.ndim < 3:
            raise ValueError(
                'multichannel=True needs an image with a channel axis, '
                'got shape {}'.format(img.shape))
        immulti = img.shape[2]
    else:
        immulti = 1
    # one row per pixel, one column per channel
    img_reshape = img.reshape(imwidth * imheight, immulti)
    # fit kmeans
    kmeans = KMeans(n_clusters=nclusters, random_state=random_state)
    kmeans.fit(img_reshape)
    return kmeans.labels_, kmeans.inertia_
def show_segmentation(img, labels):
    """Draw the cluster label of every pixel as a coloured scatter plot.

    ``img`` is only consulted for its height and width; ``labels`` supplies
    one colour value per pixel in row-major order.  Returns the matplotlib
    figure that was created.
    """
    n_rows, n_cols = img.shape[0], img.shape[1]
    # row_idx[i, j] == i and col_idx[i, j] == j: pixel coordinates in the same
    # row-major order as the flattened labels
    row_idx, col_idx = np.indices((n_rows, n_cols))
    figure, axis = plt.subplots()
    axis.scatter(x=col_idx.ravel(), y=row_idx.ravel(), c=labels)
    # flip the y axis so row 0 is drawn at the top, as in an image
    axis.set_ylim(n_rows - 1, 0)
    return figure
In [7]:
# Cluster the blurred image's pixels into 5 groups; keep the per-pixel labels and the k-means inertia.
imlabels, inertia = segmentation(filtered, nclusters=5)
In [8]:
# Show the original image again as a visual reference.
plt.imshow(image);
In [9]:
# Plot the cluster label of every pixel; `image` is only used for its height and width.
show_segmentation(image, imlabels);
In [10]:
# part 4
def parameter_selection(img, nclusters, sigmas, multichannel=True, verbose=False):
    """Run the blur + k-means segmentation for every (sigma, k) combination.

    Parameters
    ----------
    img : ndarray
        Image to segment.
    nclusters : iterable of int
        Cluster counts to try.
    sigmas : iterable
        Gaussian blur widths to try.
    multichannel : bool, default True
        Forwarded to ``pre_process`` and ``segmentation``.
    verbose : bool, default False
        Print a progress line after each fitted combination.

    Returns
    -------
    all_labels : dict
        ``(sigma, k) -> per-pixel labels``.
    inertias : dict
        ``(sigma, k) -> k-means inertia``.
    fig : matplotlib figure
        Two-column grid with one scatter plot per combination.
    """
    inertias = {}
    all_labels = {}
    for s in sigmas:
        # the blur depends on sigma only, so compute it once per sigma
        filtered = pre_process(img, sigma=s, multichannel=multichannel)
        for n in nclusters:
            labels, inertia = segmentation(filtered, nclusters=n, multichannel=multichannel)
            inertias[(s, n)] = inertia
            all_labels[(s, n)] = labels
            if verbose:
                print('sigma={} and {} clusters trained.'.format(s, n))
    # get image dimensions
    imheight, imwidth = img.shape[:2]
    # create grid
    X, Y = np.meshgrid(np.arange(imwidth), np.arange(imheight))
    # Size the grid from the number of combinations instead of assuming six;
    # the figure height keeps the original 16-inches-per-3-rows proportion.
    ncols = 2
    nrows = max(1, (len(all_labels) + ncols - 1) // ncols)
    # squeeze=False keeps `ax` two-dimensional even for a single row
    fig, ax = plt.subplots(nrows, ncols, figsize=(10, 16 * nrows / 3), squeeze=False)
    # scatterplot
    for k, key in enumerate(all_labels):
        i, j = divmod(k, ncols)
        ax[i, j].scatter(x=X.ravel(), y=Y.ravel(), c=all_labels[key])
        ax[i, j].set_ylim(imheight-1, 0)
        # raw string: '\s' is not a valid escape sequence in a normal literal
        ax[i, j].set_title(r'$\sigma={}$ and {} clusters'.format(key[0], key[1]))
    # blank out grid cells that have no combination to show
    for k in range(len(all_labels), nrows * ncols):
        i, j = divmod(k, ncols)
        ax[i, j].axis('off')
    return all_labels, inertias, fig
In [11]:
# Try every combination of 4/5/6 clusters with blur widths 3 and 6 (six runs in total),
# starting from the unblurred image; blurring happens inside parameter_selection.
nclusters = [4, 5, 6]
sigmas = [3, 6]
labels, inertias, fig = parameter_selection(image, nclusters=nclusters, sigmas=sigmas, verbose=True)
In [12]:
# Display the k-means inertia of every (sigma, n_clusters) combination.
inertias
Out[12]:
In [43]:
# part 6
# Paint the pixels of one cluster blue on top of the original image.
# sigma = 6 and 4 clusters
params = (6, 4)
# Cluster id to highlight (presumably the water region -- verify by eye).
# k-means is fitted without a seed, so the numbering can change between runs.
lab = 1
imheight, imwidth = image.shape[0], image.shape[1]
# create grid
X, Y = np.meshgrid(np.arange(imwidth), np.arange(imheight))
# flat boolean selector of the pixels assigned to cluster `lab`
water_mask = labels[params] == lab
plt.imshow(image)
plt.scatter(x=X.ravel()[water_mask], y=Y.ravel()[water_mask], c='b');
The dataset provided can be summarized as follows:
In 2013, students of the Statistics class at FSEV UK were asked to invite their friends to participate in this survey.
Your task is to use agglomerative (hierarchical) cluster analysis to analyze this data. Specifically:
Use the `hierarchy.fcluster` method to get cluster labels for 16 clusters from the data.
In [14]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
In [15]:
# part 1
responses = pd.read_csv('./Targeted/responses.csv')
# new gender column, 0=female 1=male
# Map the two known values explicitly: a missing or unexpected Gender entry
# becomes NaN (and is removed by dropna below) instead of being counted as male.
responses['gender'] = responses['Gender'].map({'female': 0, 'male': 1})
# drop categoricals
responses = responses.drop(responses.columns[responses.dtypes=='object'], axis=1)
# drop rows with nas; the NaNs forced 'gender' to float, so restore integers
responses = responses.dropna().astype({'gender': int})
In [16]:
# part 2
# Feature matrix for clustering: every remaining column except the gender indicator.
X = responses.drop('gender', axis=1).values
# Agglomerative clustering of the rows using the 'ward' linkage method.
link = linkage(X, 'ward')
In [17]:
# Full dendrogram without leaf labels; colouring uses a threshold of 60.
plt.figure(figsize=(16, 12))
ddata = dendrogram(link, no_labels=True, color_threshold=60)
plt.title('Dendrogram');
In [18]:
def get_ncluster(ddata, threshold):
    """Count the clusters obtained by cutting a dendrogram at ``threshold``.

    Parameters
    ----------
    ddata : dict
        Dendrogram data with a ``'dcoord'`` entry: one row of four heights per
        link (bottom left, upper left, upper right, bottom right).
    threshold : float
        Height of the horizontal cut.

    Returns
    -------
    int
        Number of vertical link lines crossed by the cut.  A cut at or above
        the highest link returns 1; empty dendrogram data returns 0.
    """
    # reshape keeps the column indexing valid even when there are no links
    dcoords = np.asarray(ddata['dcoord'], dtype=float).reshape(-1, 4)
    if dcoords.size and threshold >= dcoords[:, 1:3].max():
        # the cut passes above the root: everything is one cluster
        return 1
    # Half-open interval [bottom, top): a cut exactly at a merge height counts
    # the merged branch once, not its two children plus the branch itself.
    left_clusters = np.count_nonzero((dcoords[:, 0] <= threshold) & (threshold < dcoords[:, 1]))
    right_clusters = np.count_nonzero((dcoords[:, 3] <= threshold) & (threshold < dcoords[:, 2]))
    return int(left_clusters + right_clusters)
In [19]:
# Redraw the dendrogram, mark the cut at height 60 with a dashed line and
# put the number of clusters at that cut into the title.
plt.figure(figsize=(16, 12))
ddata = dendrogram(link, no_labels=True, color_threshold=60)
nclust = get_ncluster(ddata, 60)
plt.axhline(y=60, linestyle='--')
plt.title('Dendrogram, {} clusters'.format(nclust));
In [20]:
# part 3
# Truncated dendrogram (truncate_mode='lastp' with p=12), with leaf counts shown.
plt.figure(figsize=(16, 12))
ddata = dendrogram(link, p=12, color_threshold=60, truncate_mode='lastp', show_leaf_counts=True)
plt.title('Dendrogram');
In [21]:
# part 4
# Flat cluster labels from the linkage, capped at 16 clusters (criterion='maxclust').
responses['label16'] = fcluster(link, 16, criterion='maxclust')
In [22]:
# Respondents per cluster, split by gender.  Both counts are aligned on the
# full set of cluster ids so that a cluster without any women (or men) still
# gets a zero-height bar instead of shifting every following bar.
cluster_ids = np.sort(responses['label16'].unique())
female = (responses.loc[responses['gender']==0, 'label16']
          .value_counts().reindex(cluster_ids, fill_value=0))
male = (responses.loc[responses['gender']==1, 'label16']
        .value_counts().reindex(cluster_ids, fill_value=0))
positions = np.arange(len(cluster_ids))
plt.figure(figsize=(16, 8))
plt.bar(positions + 0.2, female, width=0.4, alpha=0.5, label='female')
plt.bar(positions - 0.2, male, width=0.4, alpha=0.5, label='male')
# label each bar pair with the cluster id it belongs to
plt.xticks(positions, cluster_ids)
plt.xlabel('cluster label')
plt.ylabel('number of respondents')
plt.legend();
In [23]:
# Same linkage, cut into at most 2 clusters.
responses['label2'] = fcluster(link, 2, criterion='maxclust')
In [24]:
# Respondents per cluster, split by gender, for the 2-cluster solution.
# Counts are aligned on the full set of cluster ids so that a cluster missing
# one gender still gets a zero-height bar in the right place.
cluster_ids = np.sort(responses['label2'].unique())
female = (responses.loc[responses['gender']==0, 'label2']
          .value_counts().reindex(cluster_ids, fill_value=0))
male = (responses.loc[responses['gender']==1, 'label2']
        .value_counts().reindex(cluster_ids, fill_value=0))
positions = np.arange(len(cluster_ids))
plt.figure(figsize=(8, 8))
plt.bar(positions + 0.2, female, width=0.4, alpha=0.5, label='female')
plt.bar(positions - 0.2, male, width=0.4, alpha=0.5, label='male')
# label each bar pair with the cluster id it belongs to
plt.xticks(positions, cluster_ids)
plt.xlabel('cluster label')
plt.ylabel('number of respondents')
plt.legend();
Given a set of images, use k-means to perform color quantization to reduce the number of distinct colors in each image. Specifically:
In [3]:
import os

try:
    from scipy.misc import imsave
except ImportError:
    # scipy.misc.imsave is no longer shipped with current SciPy releases;
    # skimage.io.imsave takes the same (filename, array) arguments.
    from skimage.io import imsave
In [4]:
# part 1
def quantize(img, out_colors, random_state=None):
    """Reduce an image to ``out_colors`` distinct colours with k-means.

    Parameters
    ----------
    img : array-like
        Grayscale image of shape (height, width) or colour image of shape
        (height, width, channels).
    out_colors : int
        Number of colours (k-means clusters) in the result.
    random_state : int, RandomState or None, default None
        Forwarded to ``KMeans``; pass an integer for a repeatable palette.

    Returns
    -------
    ndarray
        Image with the shape and dtype of ``img`` in which every pixel is
        replaced by the centre of its cluster.  For integer images the
        centres are rounded to the nearest level.
    """
    img = np.asarray(img)
    # get image dimensions
    imheight, imwidth = img.shape[:2]
    immulti = img.shape[2] if img.ndim == 3 else 1
    # one row per pixel, one column per channel
    img_reshape = img.reshape(imwidth * imheight, immulti)
    # fit kmeans
    kmeans = KMeans(n_clusters=out_colors, random_state=random_state)
    kmeans.fit(img_reshape)
    # Build the palette in the image's own dtype.  Rounding (instead of the
    # truncation a bare integer cast performs) avoids a systematic darkening,
    # and float images keep their fractional values.
    palette = kmeans.cluster_centers_
    if np.issubdtype(img.dtype, np.integer):
        palette = np.rint(palette)
    palette = palette.astype(img.dtype)
    # use cluster centers instead of original colors: a single vectorised
    # lookup, then back to the original (height, width[, channels]) layout
    return palette[kmeans.labels_].reshape(img.shape)
In [5]:
# Load one sample image and quantize it to 64 colours.
img = imread('./Quantization/images/bw.jpg')
q_img = quantize(img, 64)
In [6]:
# Original image, for comparison with the quantized version.
plt.imshow(img);
In [7]:
# Quantized image (64 colours).
plt.imshow(q_img);
In [8]:
# part 2
def batch_reduce(folder, file_list, out_colors_list):
    """Quantize a batch of images and save originals and results as PNG.

    Parameters
    ----------
    folder : str
        Directory (relative to the working directory) that holds the images.
    file_list : sequence of str
        File names inside ``folder``.
    out_colors_list : sequence of int
        Number of output colours for each file, matched by position.

    Raises
    ------
    ValueError
        If ``out_colors_list`` has fewer entries than ``file_list``.

    Side effects
    ------------
    Creates ``<folder>/original`` and ``<folder>/quantized`` when missing,
    writes ``<file name>.png`` into each of them and prints one progress line
    per image.
    """
    # Fail before touching the disk rather than part-way through the batch.
    if len(out_colors_list) < len(file_list):
        raise ValueError(
            'out_colors_list has {} entries but file_list has {}'.format(
                len(out_colors_list), len(file_list)))
    original_dir = './{}/original'.format(folder)
    quantized_dir = './{}/quantized'.format(folder)
    # create folders (exist_ok avoids the check-then-create race)
    os.makedirs(original_dir, exist_ok=True)
    os.makedirs(quantized_dir, exist_ok=True)
    for count, (fname, out_colors) in enumerate(zip(file_list, out_colors_list), start=1):
        # quantize image
        img = imread('./{}/{}'.format(folder, fname))
        q_img = quantize(img, out_colors)
        print('{} image(s) processed: {}'.format(count, fname))
        # save images
        imsave('{}/{}.png'.format(original_dir, fname), img)
        imsave('{}/{}.png'.format(quantized_dir, fname), q_img)
In [9]:
# part 3
# Quantize the four sample images to 32 colours each.
batch_reduce('Quantization/images', ['bw.jpg', 'coffee.jpg', 'flowers.jpg', 'grayscale.jpg'], [32, 32, 32, 32])
Create a simple custom pipeline that does the following with the survey data:
In [32]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
In [33]:
# Survey features for the pipeline: drop the two cluster-label columns added earlier.
survey = responses.drop(['label16', 'label2'], axis=1)
In [34]:
# split
# Seeded train/test split with train_size=0.67.
Xtrain, Xtest = train_test_split(survey, train_size=0.67, random_state=125)
In [35]:
# PCA
# Fit a PCA with default settings on the training rows only.
pca = PCA()
pca.fit(Xtrain)
Out[35]:
In [36]:
# Cumulative explained-variance ratio against the zero-based component index;
# the dashed line marks the 0.9 level.
plt.figure(figsize=(16, 8))
plt.plot(np.arange(len(pca.explained_variance_ratio_)), np.cumsum(pca.explained_variance_ratio_))
plt.axhline(y=0.9, linestyle='--');
In [37]:
# Number of leading components whose cumulative explained variance first
# reaches 0.9: count the entries still below 0.9 and add one.
explained_var = np.cumsum(pca.explained_variance_ratio_)
n_comp = len(explained_var[explained_var < 0.9]) + 1
# Refit the same PCA object with that many components and project the training rows.
pca.set_params(n_components=n_comp)
Xtrain_red = pca.fit_transform(Xtrain)
In [38]:
# kmeans
# Cluster the reduced training rows into 16 groups (seeded); the predicted
# cluster ids serve as class targets for the classifier.
kmeans = KMeans(n_clusters=16, random_state=147)
kmeans.fit(Xtrain_red)
ytrain = kmeans.predict(Xtrain_red)
In [39]:
# forest
# Train a seeded random forest to reproduce the cluster ids from the reduced
# features, then predict on the same training rows.
rf = RandomForestClassifier(random_state=123)
rf.fit(Xtrain_red, ytrain)
ytrain_pred = rf.predict(Xtrain_red)
In [40]:
# score
# Training-set report: the predictions are compared with the labels the forest was fitted on.
print(classification_report(ytrain, ytrain_pred))
In [41]:
# try it out on test set
# Reuse the fitted PCA, k-means and forest without refitting: project the
# held-out rows, take the k-means assignment as reference label and compare
# it with the forest's prediction.
Xtest_red = pca.transform(Xtest)
ytest = kmeans.predict(Xtest_red)
ytest_pred = rf.predict(Xtest_red)
print(classification_report(ytest, ytest_pred))